// ---------------------------------------------------------------------------
// Environment setup: start from an empty session, record which platform the
// script runs on, and execute the path-setting do-file.
// ---------------------------------------------------------------------------
clear all
global system "linux"

// platform-specific code directory and path separator
if ("$system" == "linux") {
	global code "/"
	global s "/"
}

// _set-path.do is run quietly; NOTE(review): presumably it defines the
// ${embsRaw} and ${temp} globals used further down - confirm
run "${code}${s}_set-path.do"

*** Select ZIP codes that belong to a high-cost county
* Input : cll_arra.dta (one row per county; uses state_id, fipscode, cll)
*         ZIP_COUNTY_032010.dta (county-to-ZIP rows; uses prop_zip_code, res_ratio)
* Output: one row per prop_zip_code with highcost_ratio, state_id, cll_min, cll_max
use "cll_arra.dta", clear
drop if inlist(state_id, "PR", "AK", "HI", "VI")

// attach every ZIP code of each county; counties without a ZIP match are kept
// (keep(master matched)) and end up with a missing prop_zip_code
rename fipscode county
merge 1:m county using "ZIP_COUNTY_032010.dta", keep(master matched) nogen keepusing(prop_zip_code res_ratio)

// ZIP-level summaries over all counties that a ZIP code overlaps
sort prop_zip_code county 
// sum of res_ratio over the counties whose cll exceeds 517000
by prop_zip_code: egen highcost_ratio = total(res_ratio * (cll > 517000))
by prop_zip_code: egen cll_min = min(cll)
// FIX: this used min(cll), which made cll_max identical to cll_min and turned
// the filter below into a test on the smallest cll of the ZIP code
by prop_zip_code: egen cll_max = max(cll)

// keep ZIP codes where at least one overlapping county has cll above 467000
keep if cll_max > 467000

keep prop_zip_code highcost_ratio state_id cll_m*
// collapse to one row per ZIP code; `force' is needed because state_id may
// differ across the rows of a ZIP code, in which case an arbitrary row survives
duplicates drop prop_zip_code, force


// ---------------------------------------------------------------------------
// Attach the LPS loan file to the selected ZIP codes; only ZIP codes present
// in both data sets survive.
// Note from the data pull: the McDash loan origination file was merged to the
// performance file, keeping only the first record for each combination of
// loan id and investor type. "min_date" is the date of that first record.
// ---------------------------------------------------------------------------
merge 1:m prop_zip_code using "${embsRaw}/lps-hibal.dta", keep(matched) nogenerate

// origination date: string -> numeric Stata date, kept under the same name
generate orig_date_num = date(orig_date, "YMD")
drop orig_date
rename orig_date_num orig_date

generate year = year(orig_date)
generate orig_ym = mofd(orig_date)

// ---- sample selection ----
*keep if year <= 2013
// missing loan_to_value compares as larger than 80, so it is dropped as well
drop if loan_to_value > 80

// appraisal value within +/- 250,000 of 417000/.8
local cutoff = 417000/.8
local lower = `cutoff' - 2.5e+5
local upper = `cutoff' + 2.5e+5
keep if inrange(appraisal_amount, `lower', `upper')

// ---- GSE loans only (investor types "2" and "3") ----
keep if inlist(investor_type_id, "2", "3")
// one row per loan: the record with the latest min_date
bysort mcdash_id (min_date): keep if _n == _N

// agency code: 1 = fannie (type "2"), 2 = freddie (type "3")
generate agency_id = cond(investor_type_id == "2", 1, cond(investor_type_id == "3", 2, .))


// integer-valued ltv (rounded up to the next integer)
gen orig_loan_to_value = ceil(loan_to_value)

// renaming
rename current_int_rate orig_rate 
rename orig_amount orig_loan_amount

// termination date: convert the "YMD" string to a numeric Stata daily date
gen temp = date(termination_date, "YMD")
drop termination_date
rename temp termination_date
format termination_date %td

// anonymized loan id
// FIX: `egen mcdash_id = group(mcdash_id)' aborts with "variable mcdash_id
// already defined" because egen can only create a new variable. The group id
// is now stored in loan_id before the original identifier is dropped.
// group() numbers loans 1, 2, ... in the sort order of mcdash_id; the ids are
// not random.
// NOTE(review): the name loan_id is chosen here - confirm against the scripts
// that read lps-prep.dta
egen loan_id = group(mcdash_id)
drop mcdash_id

save "${temp}/lps-prep.dta", replace



